This is an R Markdown
Notebook. When you execute code within the notebook, the results appear
beneath the code.
Try executing this chunk by clicking the Run button within
the chunk or by placing your cursor inside it and pressing
Cmd+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the
toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and
output will be saved alongside it (click the Preview button or
press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the
editor. Consequently, unlike Knit, Preview does not
run any R code chunks. Instead, the output of the chunk when it was last
run in the editor is displayed.
Import necessary libraries
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
Importing necesary CSV files
#Setting working directory
setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets/transcriptomic_datasheets")
Warning: The working directory was changed to /Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets/transcriptomic_datasheets inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
#Importing trancriptomic datasdets
Talkowski2014 <- read.csv("Talkowski_2014_TableS3_Human_Del.csv")
Urresti2021_3M_organoid <- read.csv("Urresti-2021_DEG-3M-cortical-organoid.csv")
Roth2020 <- read.csv("Roth_2020_DEG-hIPSC-neural-endodermMesoderm.csv")
#Importing file containing 16p11.2 gene list
chr16p11.2_gene_list_Kusenda2015 <- read.csv("Kusenda_2015_TableS1_16p11.2_gene_list.csv")
#Creating a vector of all genes in the 16p11.2 chromosomal region
sixteenP11.2_genes <- chr16p11.2_gene_list_Kusenda2015$Gene.symbol
Modifying dataframes, such that they have the same column names, in
order to combine them
- Uresti 2021 cortical organoids data
Urresti2021_3M_organoid_tidy <- Urresti2021_3M_organoid %>%
#remove columns that i do not need
select(-c("Gene.Biotype", "log2.FC..DUP.vs.DEL", "P.value..DUP.vs.DEL", "FDR.adjusted.P.value..DUP.vs.DEL", "log2.FC..DUP.vs.CTL",
"P.value..DUP.vs.CTL", "FDR.adjusted.P.value..DUP.vs.CTL", "FDR.adjusted.P.value..DEL.vs.CTL")) %>%
#rename columns to the standardized names I decided on
rename("gene_name" = "HGNC.Symbol",
"log2_fold_change" = "log2.FC..DEL.vs.CTL",
"p_value" = "P.value..DEL.vs.CTL",
"gene_description" = "Description",
"ENSEMBL_ID" = "ENSEMBL.ID") %>%
#add a column for the data source
mutate( "sample_type" = "cortical organoid", "data_source" = "Urresti et al. 2021", "Present.in.the.16p11.2.Region" = "") %>%
#reorder columns so that the ensemble_ID column, which not all dataframes will have, is at the end
relocate("ENSEMBL_ID", .after = "data_source")
- Roth 2020 data
Roth2020_tidy <- Roth2020 %>%
# remove columns that I do not need
select(-c("SFARI.Gene.Score", "SFARI.Gene.Score.Category", "https...gene.sfari.org.about.gene.scoring.criteria.")) %>%
#rename columns to the standardized names I decided on
rename("gene_name" = "X16p11.2.DE.Genes",
"log2_fold_change" = "Log2.Fold.Change..Positive.values.are.upregulated.in.DEL.",
"p_value" = "Adjusted.P.Value") %>%
#add columns for the data source, sample type, and extras to match the above dataset
mutate("gene_description" = "", "sample_type" = "hIPSC", "data_source" = "Roth et al. 2020", "ENSEMBL_ID" = "") %>%
#reorder columns to match the other dataframes
relocate("gene_description", .after = "gene_name") %>%
relocate("Present.in.the.16p11.2.Region", .after = "ENSEMBL_ID")
- Talkowski 2014 data
Talkowski2014_tidy <- Talkowski2014 %>%
select(-c("GeneInfo", "Chr", "Start", "Stop")) %>%
#select(-c("GeneInfo", "Chr", "Start", "Stop")) %>%
#Transforming the foldchange to log2 foldchange
mutate("FoldChange" = log2(FoldChange)) %>%
#rename columns to the standardized names I decided on
rename("gene_name" = "GeneID",
"log2_fold_change" = "FoldChange",
"p_value" = "permPval_Del") %>%
#add columns for the data source, sample type, and extras to match the above dataset
mutate("gene_description" = "", "sample_type" = "hLCL", "data_source" = "Talkowski et al. 2014", "ENSEMBL_ID" = "", "Present.in.the.16p11.2.Region" = "") %>%
#reorder columns to match the other dataframes
relocate("gene_description", .after = "gene_name") %>%
relocate("p_value", .after = "log2_fold_change")
Potential next step: I tried to add in the ensembl ID for the
Talkowski 2014 data, but that dataset used an old refrence transcriptome
and so, using the chromosomal start and stop positions and chromosome
number, I was unable to find the ensembl ID for the genes in that
dataset. I
Now, I want to fill in the “Present.in.the.16p11.2.Region” column for
the DEG_all_timepoints_organoid_Urresti2021_modified_columns
dataframe.
# The vector sixteenP11.2_genes is a vector containing all genes located in the 16p11.2 chromosomal region
# I want to change the value of the "Present.in.the.16p11.2.Region" column to "Yes" if the gene name is in the vector, and "No" if it is not
Urresti2021_3M_organoid_tidy$Present.in.the.16p11.2.Region <-
ifelse(Urresti2021_3M_organoid_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")
Roth2020_tidy$Present.in.the.16p11.2.Region <-
ifelse(Roth2020_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")
Talkowski2014_tidy$Present.in.the.16p11.2.Region <-
ifelse(Talkowski2014_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")
Now I combine all 3 organized dataframes into 1 dataframe
all_DEG_data <- rbind(Urresti2021_3M_organoid_tidy, Roth2020_tidy, Talkowski2014_tidy)
# Now, I want to look at the genes in the 16p11.2 region only, from the all DEG data. So I will filter the all_DEG_data dataframe to only include those genes
all_DEG_16p_genes <- all_DEG_data %>%
filter(Present.in.the.16p11.2.Region == "Yes")
No I will do a metaanalysis to combine every instance of a gene into
one row, so that I can see all the data for each gene in one row Log
fold change is weighted by p-value p values are combined with fishers
exact test
meta_analysis_all_DEG_data <- all_DEG_data %>%
group_by(gene_name) %>%
summarize(combined_log2_fold_change = sum(log2_fold_change / (p_value + 1e-8)) / sum(1 / (p_value + 1e-8)), # Weighted logFC using p-values.
# A small constant (1e-8) is added to avoid division by zero if there are very small p-values.
#in the next line of code I will do a fisher test to combine the p-values
combined_p_value = 1 - pchisq(sum(qchisq(1 - p_value, 1, lower.tail = FALSE)), length(p_value), lower.tail = FALSE)) %>%
#the above code will return a dataframe that only contains 3 of the columns. In the next line of code I will bring the rest of the columns back to this new dataframe
left_join(all_DEG_data, by = "gene_name")
#Keep track of if the gene combined logfoldcahnge and p-value are from multiple data sources
meta_analysis_all_DEG_data$multiple_data_sources <- ifelse(duplicated(meta_analysis_all_DEG_data$gene_name) | duplicated(meta_analysis_all_DEG_data$gene_name, fromLast = TRUE), "Yes", "No")
#Now, I want to delete rows that have the same gene name and remove the columns that have the original log2_fold_change, p_value, sample_type, and data_source
meta_analysis_all_DEG_data <- meta_analysis_all_DEG_data %>%
distinct(gene_name, .keep_all = TRUE) %>%
select(-c(log2_fold_change, p_value, sample_type, data_source))
#This code adds a column called significance that will indicate the degree of significance, just like Smrithi's code
meta_analysis_all_DEG_data <- meta_analysis_all_DEG_data %>%
mutate("significance" = case_when(
combined_p_value < 0.001 ~ "***",
combined_p_value < 0.01 ~ "**",
combined_p_value < 0.05 ~ "*",
TRUE ~ " "
))
#Now i will create a new dataframe with only the genes that are significantly differentially expressed at combined p-value < 0.001
sig_genes_p0.001_DEG_data <- meta_analysis_all_DEG_data %>%
filter(significance == "***")
Exporting the dataframes to CSV files, if desired
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
write.csv(Urresti2021_3M_organoid_tidy, file="Urresti2021_3M_organoid_tidy.csv")
write.csv(Roth2020_tidy, file="Roth2020_tidy.csv")
write.csv(Talkowski2014_tidy, file="Talkowski2014_tidy.csv")
write.csv(all_DEG_data, file="all_DEG_data.csv")
write.csv(meta_analysis_all_DEG_data, file="meta_analysis_all_DEG_data.csv")
write.csv(sig_genes_p0.001_DEG_data, file="sig_genes_p0.001_DEG_data.csv")
Gene ontology analysis I am using gprofiler2 to do my GO analysis
Loading gprofiler2 package
library(gprofiler2)
Doing GO analysis
up_reg_genes <- ssig_genes_p0.001_DEG_data %>% filter(combined_log2_fold_change > 0)
Error: object 'ssig_genes_p0.001_DEG_data' not found
Exporting the GO analysis dataframes to CSV files, if desired
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
write.csv(gostres_results, file="/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/output/GO_16p11.2-DEGs_p0.001_fdr-corrections.csv", row.names = FALSE)
NOW, we generate map of our significant genes to AHBA spatial map
First, read in the data
AHBA <- read_tsv("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets/AllenHBA_DK_ExpressionMatrix.tsv")
New names:
• `` -> `...1`
Rows: 20736 Columns: 70
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): ...1
dbl (69): Average donor correlation to median, ctx-rh-insula, ctx-lh-paracentral, ctx-rh-precuneus, ctx-lh-me...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Now, I want to cchange the column header of the first column of the AHBA dataframe to gene_symbol
colnames(AHBA)[1] <- "gene_symbol"
Mapping our significant genes to the AHBA dataset
#Creating vector of gene names in our experimental groups
sig_genes_p0.001_DEG_data #dataframe containing all significant genes
sig_gene_list <- sig_genes_p0.001_DEG_data$gene_name #vector of all significant genes names
sixteenP11.2_genes #dataframe containing all genes in 16p11.2 locus
sixteen_p_11.2_genes <- chr16p11.2_gene_list_Kusenda2015$Gene.symbol #vector of all genes in 16p locus
AHBA_sig_genes <- AHBA %>% filter(gene_symbol %in% sig_gene_list)
#NOTE: not all sig genes mapped to AHBA dataset. Here is a list of them so I can troubleshoot later
genes_not_mapped <- sig_gene_list[!(sig_gene_list %in% AHBA_sig_genes$gene_symbol)]
NOTE: Only 87 of the 132 genes mapped to the AHBA dataset
Tidying the AHBA subset dataframe with significant genes only
AHBA_sig_genes_pivot <- AHBA_sig_genes %>%
#remove the "Average donor correlation to the median" column
select(-"Average donor correlation to median") %>%
pivot_longer(cols = -gene_symbol, names_to = "region", values_to = "expression") %>%
#adding a column for if gene is in the 16p region
mutate(in_16p_region = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No")) %>%
# Adding a column for if gene is up or downregulated
mutate(upreg = ifelse(gene_symbol %in% up_reg_gene_names, "Yes", "No"))
GGplot to visualize the data
# Visualizing expression data of significant genes
ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, shape=in_16p_region, color=gene_symbol)) +
geom_point() +
labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
theme_minimal() +
#I will make the legend smaller and put it underneath the graph because it is too big
theme(legend.position = "bottom") +
guides(color="none")

# same plot but aesthetics only for genes in 16p region
ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, color=in_16p_region)) +
geom_point() +
labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
theme_minimal() +
#I will make the legend smaller and put it underneath the graph because it is too big
theme(legend.position = "bottom")

ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, color=upreg)) +
geom_point() +
labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
theme_minimal() +
#I will make the legend smaller and put it underneath the graph because it is too big
theme(legend.position = "bottom")

Now, we want to work with the normalized AHBA values. They are in a
matlab file, so first I will need to convert the matlab file to
something R can read, with the R.matlab package. NOTE: The R.matlab
package does not keep column names of the original data when converting
things, so I had to add them back in.
First, load libraries
library(R.matlab)
R.matlab v3.7.0 (2022-08-25 21:52:34 UTC) successfully loaded. See ?R.matlab for help.
Attaching package: ‘R.matlab’
The following objects are masked from ‘package:base’:
getOption, isOpen
Second, load in relevant files
setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics")
AHBA_normed <- readMat("ROIxGene_aparcaseg_INT.mat")
setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets")
region_array <- read_csv("Region_labels_for_aparcaseg_parcellation.csv", col_names = FALSE)
Tidying up the AHBA_normed dataframe
#Yay now data table is as i want it to be!
# Now I will pivot the table so that it matches the AHBA_sig_genes_all_pivot table
AHBA_normed_pivot <- AHBA_parcel_expression0_labeled %>%
pivot_longer(cols = -region, names_to = "gene_symbol", values_to = "expression") %>%
mutate(in_16p_region = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No")) %>%
unnest(cols = region)
AHBA_normed_pivot_sigONLY <- AHBA_normed_pivot %>%
filter(gene_symbol %in% gene_list_all)
Error in `filter()`:
ℹ In argument: `gene_symbol %in% gene_list_all`.
Caused by error:
! object 'gene_list_all' not found
Backtrace:
1. AHBA_normed_pivot %>% filter(gene_symbol %in% gene_list_all)
9. gene_symbol %in% gene_list_all
Exporting dataframes as CSVs
# OK, Now that I have this data, I want to put it into a CSV so I have it for future reference
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
write.csv(AHBA_normed_pivot, "Normed_AHBA_sig_genes.csv")
write.csv(AHBA_normed_pivot_sigONLY, "Normed_AHBA_sig_genes_ONLY.csv")
write.csv(AHBA_sig_genes_all_pivot_ONLY, "Raw_AHBA_sig_genes_ONLY.csv")
Plotting the normalized AHBA data
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, color=gene_symbol)) +
geom_point() +
labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
theme_minimal() +
#I will remove the legend because it is too big
theme(legend.position = "none")

# Visualizing expression data of significant genes
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, shape=in_16p_region, color=gene_symbol)) +
geom_point() +
labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
theme_minimal() +
#I will make the legend smaller and put it underneath the graph because it is too big
theme(legend.position = "bottom") +
guides(color="none")

# same plot but aesthetics only for genes in 16p region
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, color=in_16p_region)) +
geom_point() +
labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
theme_minimal() +
#I will make the legend smaller and put it underneath the graph because it is too big
theme(legend.position = "bottom")

Now I will map the normalized AHBA data to Smrithi’s MRI data, which
is in a csv file: 16panalysis_BH_adjusted.csv
Importing necessary files
# Importing Smrithi's MRI dataset
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
Warning: The working directory was changed to /Users/tammyray/Desktop/aaron_lab_2024/output inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
MRI <- read_csv("16panalysis_BH_adjusted.csv")
Rows: 1224 Columns: 10
── Column specification ────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Genotype, Model, VolumetricComponent, Significance
dbl (6): Estimate, Std. Error, t value, Pr(>|t|), SigPos, adjusted_p
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Outside of R, I have made a CSV of corresponding AHBA and Smrithi MRI data region names. First column is AHBA names, second column is MRI names.
# Importing the data
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
standard_region_names <- read_csv("AHBA_MRI_region_names.csv")
Rows: 34 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): AHBA regions, left hemisphere Smrithi regions
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remembering other files I will use in the following chunks
sixteen_p_11.2_genes #vector of all genes in 16p locus
[1] "SLC7A5P1" "SPN" "QPRT" "C16orf54" "ZG16" "KIF22" "MAZ"
[8] "PRRT2" "C16orf53" "MVP" "CDIPT" "LOC440356" "SEZ6L2" "ASPHD1"
[15] "KCTD13" "TMEM219" "TAOK2" "HIRIP3" "INO80E" "DOC2A" "C16orf92"
[22] "FAM57B" "ALDOA" "PPP4C" "TBX6" "YPEL3" "GDPD3" "MAPK3"
[29] "CORO1A" "BOLA2" "SLX1A" "SULT1A3" "IMMA" "SMG1 "
Tidying Smrithi’s MRI data
# We decided that we only want to use model 3, Spline TCV model (decided w Smrithi meeting 8/24/24)
# We are only using the deletion data for now
MRI_model3 <- MRI %>%
filter(Genotype == "Genotype16pDeletion...5")
#MRI regions to remove. I made this list manually using google sheets and reformatted it for R using chatgpt
MRI_regions_to_remove <- c("L.Cerebral_WM", "L.cerebral_cortex", "L.lateral_ventricle", "L.inferior_lateral_ventricle", "L.cerebellum_WM", "L.cerebellum_cortex",
"L.thalamus", "L.caudate", "L.putamen", "L.pallidum", "L.accumbens", "L.hippocampus", "L.amygdala", "L.ventral_DC", "Third_Ventricle",
"Fourth_Ventricle", "Brainstem", "CSF", "R.Cerebral_WM", "R.cerebral_cortex", "R.lateral_ventricle", "R.inferior_lateral_ventricle",
"R.cerebellum_WM", "R.cerebellum_cortex", "R.thalamus", "R.caudate", "R.putamen", "R.pallidum", "R.hippocampus", "R.amygdala", "R.accumbens",
"R.ventral_DC", "R.ctx_bankssts", "R.ctx_caudalanteriorcingulate", "R.ctx_caudalmiddlefrontal", "R.ctx_cuneus", "R.ctx_entorhinal", "R.ctx_fusiform",
"R.ctx_inferiorparietal", "R.ctx_inferiortemporal", "R.ctx_isthmuscingulate", "R.ctx_lateraloccipital", "R.ctx_lateralorbitofrontal", "R.ctx_lingual",
"R.ctx_medialorbitofrontal", "R.ctx_middletemporal", "R.ctx_parahippocampal", "R.ctx_paracentral", "R.ctx_parsopercularis", "R.ctx_parsorbitalis",
"R.ctx_parstriangularis", "R.ctx_pericalcarine", "R.ctx_postcentral", "R.ctx_posteriorcingulate", "R.ctx_precentral", "R.ctx_precuneus",
"R.ctx_rostralanteriorcingulate", "R.ctx_rostralmiddlefrontal", "R.ctx_superiorfrontal", "R.ctx_superiorparietal", "R.ctx_superiortemporal",
"R.ctx_supramarginal", "R.ctx_frontalpole", "R.ctx_temporalpole", "R.ctx_transversetemporal", "R.ctx_insula", "cGMV", "WMV", "sGMV", "Ventricles",
"Cerebellum", "Accumbens", "Ventral_Diencephalon", "Pallidum", "Hippocampus", "Caudate", "Cerebral_White_Matter", "Lateral_Ventricle", "Cerebral_Cortex",
"Thalamus", "Putamen", "Amygdala", "Cerebellar_White_Matter", "Cerebellar_Cortex", "Inferior_Lateral_Ventricle", "ctx_bankssts", "ctx_caudalanteriorcingulate",
"ctx_caudalmiddlefrontal", "ctx_cuneus", "ctx_entorhinal", "ctx_fusiform", "ctx_inferiorparietal", "ctx_inferiortemporal", "ctx_isthmuscingulate",
"ctx_lateraloccipital", "ctx_lateralorbitofrontal", "ctx_lingual", "ctx_medialorbitofrontal", "ctx_middletemporal", "ctx_parahippocampal",
"ctx_paracentral", "ctx_parsopercularis", "ctx_parsorbitalis", "ctx_parstriangularis", "ctx_pericalcarine", "ctx_postcentral", "ctx_posteriorcingulate",
"ctx_precentral", "ctx_precuneus", "ctx_rostralanteriorcingulate", "ctx_rostralmiddlefrontal", "ctx_superiorfrontal", "ctx_superiorparietal",
"ctx_superiortemporal", "ctx_supramarginal", "ctx_frontalpole", "ctx_temporalpole", "ctx_transversetemporal", "ctx_insula")
# removing rows for the regions we aren't including
MRI_model3_leftHem <- MRI_model3 %>%
filter(VolumetricComponent %in% MRI_regions_to_remove == FALSE) %>%
rename(region = VolumetricComponent)
# Now, I want to use the standard region
standard_region_names <- standard_region_names %>%
rename(region = "left hemisphere Smrithi regions",
new_region = "AHBA regions")
# Use left_join to map the new names based on the region variable
MRI_model3_leftHem_new_regionNames <- MRI_model3_leftHem %>%
left_join(standard_region_names, by = "region") %>% # Join based on the region variable
mutate(region = ifelse(!is.na(new_region), new_region, region)) %>% # Replace region names with new ones
select(-new_region) # Remove the extra column if needed
Merging the AHBA normed values with Smrithi’s MRI data
AHBA_MRI_merged <- full_join(AHBA_normed_pivot_sigONLY, MRI_model3_leftHem_new_regionNames, by = "region", multiple = "all")
AHBA_MRI_merged_organized <- AHBA_MRI_merged %>%
select(-Genotype)
colnames(AHBA_MRI_merged_organized)[3] <- "AHBA_expression"
colnames(AHBA_MRI_merged_organized)[8] <- "t_value"
Exporting as csv AHBA_MRI_merged_organized
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
write_csv(AHBA_MRI_merged_organized, "AHBA_MRI_merged.csv")
Pearson correlation analysis I want to do a correlation analysis
between the AHBA expression and the MRI data
Creating functions to compute correlation and p-value, created by
ChatGPT
correlation_with_pvalue_pearson <- function(x, y) {
test <- cor.test(x, y, method = "pearson")
return(data.frame(correlation = test$estimate, p_value = test$p.value))
}
correlation_with_pvalue_spearman <- function(x, y) {
test <- cor.test(x, y, method = "spearman")
return(data.frame(correlation = test$estimate, p_value = test$p.value))
}
Running the correlation analysis
# Group by gene_symbol and compute correlation between AHBA_expression and Estimate
cor_results_Estimate_pearson <- AHBA_MRI_merged_organized %>%
group_by(gene_symbol) %>%
summarize(correlation = cor(AHBA_expression, Estimate, method = "pearson"),
p_value = correlation_with_pvalue_pearson(AHBA_expression, Estimate)$p_value) %>%
mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))
#Trying a spearman correlation analysis instead
cor_results_Estimate_spearman <- AHBA_MRI_merged_organized %>%
group_by(gene_symbol) %>%
summarize(correlation = cor(AHBA_expression, Estimate, method = "spearman"),
p_value = cor.test(AHBA_expression, Estimate, method = "spearman")$p.value) %>%
mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))
# Doing the same thing for the t value
# Pearson correlation
cor_results_t_value_pearson <- AHBA_MRI_merged_organized %>%
group_by(gene_symbol) %>%
summarize(correlation = cor(AHBA_expression, t_value, method = "pearson"),
p_value = correlation_with_pvalue_pearson(AHBA_expression, t_value)$p_value) %>%
mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))
#Spearman correlation
cor_results_t_value_spearman <- AHBA_MRI_merged_organized %>%
group_by(gene_symbol) %>%
summarize(correlation = cor(AHBA_expression, t_value, method = "spearman"),
p_value = cor.test(AHBA_expression, t_value, method = "spearman")$p.value) %>%
mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))
Exporting the correlation analysis dataframes to CSV files
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
write_csv(cor_results_Estimate_pearson, "AHBA_MRI_correlation_Estimate_Pearson.csv")
write_csv(cor_results_Estimate_spearman, "AHBA_MRI_correlation_Estimate_Spearman.csv")
write_csv(cor_results_t_value_pearson, "AHBA_MRI_correlation_t_value_Pearson.csv")
write_csv(cor_results_t_value_spearman, "AHBA_MRI_correlation_t_value_Spearman.csv")
Now, I want to visualize the correlation data
#Scatterplot showing relationship between p value and correlation, not super useful but educational for me!
ggplot(cor_results_t_value_spearman, aes(x=p_value, y=correlation, color=in_16p_locus)) +
geom_point() +
labs(title = "Spearman correlation of t-value",
x = "p value",
y = "correlation") +
theme_minimal() +
geom_vline(aes(xintercept = 0.05))

#making a density curve instead?
ggplot(cor_results_t_value_spearman, aes(x=correlation, color=in_16p_locus)) +
geom_density() +
labs(title = "Spearman correlation of t-value",
x = "Correlation Coefficient",
y = "Density") +
theme_minimal()

# making a histogram instead
ggplot(cor_results_t_value_spearman, aes(x= abs(correlation), fill=in_16p_locus)) +
geom_histogram(position = "identity", alpha = 0.6) +
labs(title = "Spearman correlation of t-value",
x = "Absolute Value of Correlation Coefficient",
y = "Frequency") +
theme_minimal() +
xlim(0, 0.6) + ylim(0, 8)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 4 rows containing missing values or values outside the scale range (`geom_bar()`).

ggplot(cor_results_Estimate_spearman, aes(x=abs(correlation), fill=in_16p_locus)) +
geom_histogram(position = "identity", alpha = 0.6) +
labs(title = "Spearman correlation of Estimate(~effect size)",
x = "Absolute Value of Correlation Coefficient",
y = "Frequency") +
theme_minimal() +
xlim(0, 0.6) + ylim(0, 8)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 4 rows containing missing values or values outside the scale range (`geom_bar()`).

Now I want to plot the data
library("ggpubr")
library(plotly)
Doing the above plotting for the 16p genes only
# Doing the above plotting for the NON 16p genes ----
AHBA_MRI_merged_organized_NOT_IN_16p_locus <- AHBA_MRI_merged_organized %>% filter(in_16p_region == "No")
scatter_plot_NOT_16p_genes <- ggplot(AHBA_MRI_merged_organized_NOT_IN_16p_locus, aes(x = AHBA_expression, y = Estimate, color = region)) +
geom_point(size = 0.1) +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5, fullrange = TRUE) + # Add a line of best fit (linear regression)
labs(title = "Scatter plots of AHBA Expression vs Estimate for Genes NOT in 16p locus",
x = "AHBA Expression",
y = "Estimate") +
facet_wrap(~ gene_symbol, scales = "free") + # Separate plots for each gene
theme_minimal() +
stat_cor(aes(label = paste(..r.label.., sep = "~`,`~")),
method = "pearson", label.x = 3, label.y = 0) + # Add correlation coefficients to the plot
xlim(0, 1) # Adjust axis limits
# Convert to a plotly object
interactive_plot_NOT_16p_genes <- ggplotly(scatter_plot_NOT_16p_genes, tooltip = c("text"))
# Display the interactive plot
interactive_plot_NOT_16p_genes
Adding correlation coefficiencies and p values to plots 1. Making new
dataframw with facet labels
# Merge main data with correlation results
plot_data_t_value_spearman <- AHBA_MRI_merged_organized %>%
left_join(cor_results_t_value_spearman, by = "gene_symbol") %>% # Merge by gene_symbol
mutate(facet_label = paste0(gene_symbol, "\n",
"| R: ", round(correlation, 2),
", p: ", round(p_value, 3)))
- replotting data
ggplot(plot_data_t_value_spearman, aes(x = AHBA_expression, y = t_value, color = region)) +
geom_point(size = 0.1) +
geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5, fullrange = TRUE) + # Add a line of best fit (linear regression)
labs(title = "Scatter plots of AHBA Expression vs t-value for all Genes",
x = "AHBA Expression",
y = "t-value") +
facet_wrap(~ facet_label, scales = "free") + # Use the custom facet label
theme_minimal() +
xlim(0, 1) #+ ylim(-0.7, 0.7) # Adjust axis limits
---
title: "16p11.2 CNV project"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file). 

The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike *Knit*, *Preview* does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.

Import necessary libraries
```{r}
library(tidyverse)


```

Importing necesary CSV files
```{r}
#Setting working directory
setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets/transcriptomic_datasheets")
#Importing trancriptomic datasdets
Talkowski2014 <- read.csv("Talkowski_2014_TableS3_Human_Del.csv")
Urresti2021_3M_organoid <- read.csv("Urresti-2021_DEG-3M-cortical-organoid.csv")
Roth2020 <-  read.csv("Roth_2020_DEG-hIPSC-neural-endodermMesoderm.csv")
#Importing file containing 16p11.2 gene list
chr16p11.2_gene_list_Kusenda2015 <-  read.csv("Kusenda_2015_TableS1_16p11.2_gene_list.csv")
#Creating a vector of all genes in the 16p11.2 chromosomal region
sixteenP11.2_genes <- chr16p11.2_gene_list_Kusenda2015$Gene.symbol

```

Modifying dataframes, such that they have the same column names, in order to combine them

1. Uresti 2021 cortical organoids data
```{r}
Urresti2021_3M_organoid_tidy <- Urresti2021_3M_organoid %>%
  #remove columns that i do not need
  select(-c("Gene.Biotype", "log2.FC..DUP.vs.DEL", "P.value..DUP.vs.DEL", "FDR.adjusted.P.value..DUP.vs.DEL", "log2.FC..DUP.vs.CTL",
            "P.value..DUP.vs.CTL", "FDR.adjusted.P.value..DUP.vs.CTL", "FDR.adjusted.P.value..DEL.vs.CTL")) %>%
  #rename columns to the standardized names I decided on
  rename("gene_name" = "HGNC.Symbol",
         "log2_fold_change" = "log2.FC..DEL.vs.CTL",
         "p_value" = "P.value..DEL.vs.CTL",
         "gene_description" = "Description", 
         "ENSEMBL_ID" = "ENSEMBL.ID") %>%
  #add a column for the data source
  mutate( "sample_type" = "cortical organoid", "data_source" = "Urresti et al. 2021", "Present.in.the.16p11.2.Region" = "") %>%
  #reorder columns so that the ensemble_ID column, which not all dataframes will have, is at the end
  relocate("ENSEMBL_ID", .after = "data_source")

```

2. Roth 2020 data
```{r}
Roth2020_tidy <- Roth2020 %>%
  # remove columns that I do not need
  select(-c("SFARI.Gene.Score", "SFARI.Gene.Score.Category", "https...gene.sfari.org.about.gene.scoring.criteria.")) %>%
  #rename columns to the standardized names I decided on
  rename("gene_name" = "X16p11.2.DE.Genes",
         "log2_fold_change" = "Log2.Fold.Change..Positive.values.are.upregulated.in.DEL.",
         "p_value" = "Adjusted.P.Value") %>%
  #add columns for the data source, sample type, and extras to match the above dataset
  mutate("gene_description" = "", "sample_type" = "hIPSC", "data_source" = "Roth et al. 2020", "ENSEMBL_ID" = "") %>%
  #reorder columns to match the other dataframes
  relocate("gene_description", .after = "gene_name") %>%
  relocate("Present.in.the.16p11.2.Region", .after = "ENSEMBL_ID")

```

3. Talkowski 2014 data
```{r} 
Talkowski2014_tidy <- Talkowski2014 %>% 
  select(-c("GeneInfo", "Chr", "Start", "Stop")) %>%
  #select(-c("GeneInfo", "Chr", "Start", "Stop")) %>%
  #Transforming the foldchange to log2 foldchange
  mutate("FoldChange" = log2(FoldChange)) %>%
  #rename columns to the standardized names I decided on
  rename("gene_name" = "GeneID",
         "log2_fold_change" = "FoldChange",
         "p_value" = "permPval_Del") %>%
  #add columns for the data source, sample type, and extras to match the above dataset
  mutate("gene_description" = "", "sample_type" = "hLCL", "data_source" = "Talkowski et al. 2014", "ENSEMBL_ID" = "", "Present.in.the.16p11.2.Region" = "") %>%
  #reorder columns to match the other dataframes
  relocate("gene_description", .after = "gene_name") %>%
  relocate("p_value", .after = "log2_fold_change")

```

Potential next step: I tried to add in the ensembl ID for the Talkowski 2014 data, but that dataset used an old refrence transcriptome and so, using the 
chromosomal start and stop positions and chromosome number, I was unable to find the ensembl ID for the genes in that dataset. I




Now, I want to fill in the "Present.in.the.16p11.2.Region" column for the DEG_all_timepoints_organoid_Urresti2021_modified_columns dataframe. 
```{r}
# The vector sixteenP11.2_genes is a vector containing all genes located in the 16p11.2 chromosomal region
# I want to change the value of the "Present.in.the.16p11.2.Region" column to "Yes" if the gene name is in the vector, and "No" if it is not

Urresti2021_3M_organoid_tidy$Present.in.the.16p11.2.Region <-
  ifelse(Urresti2021_3M_organoid_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")
Roth2020_tidy$Present.in.the.16p11.2.Region <-
  ifelse(Roth2020_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")
Talkowski2014_tidy$Present.in.the.16p11.2.Region <-
  ifelse(Talkowski2014_tidy$gene_name %in% sixteenP11.2_genes, "Yes", "No")

```


Now I combine all 3 organized dataframes into 1 dataframe
```{r}
all_DEG_data <- rbind(Urresti2021_3M_organoid_tidy, Roth2020_tidy, Talkowski2014_tidy)

# Now, I want to look at the genes in the 16p11.2 region only, from the all DEG data. So I will filter the all_DEG_data dataframe to only include those genes
all_DEG_16p_genes <- all_DEG_data %>%
  filter(Present.in.the.16p11.2.Region == "Yes")

```


No I will do a metaanalysis to combine every instance of a gene into one row, so that I can see all the data for each gene in one row
Log fold change is weighted by p-value
p values are combined with fishers exact test
```{r}
meta_analysis_all_DEG_data <- all_DEG_data %>%
  group_by(gene_name) %>%
  summarize(combined_log2_fold_change = sum(log2_fold_change / (p_value + 1e-8)) / sum(1 / (p_value + 1e-8)),  # Weighted logFC using p-values.
            # A small constant (1e-8) is added to avoid division by zero if there are very small p-values.
            #in the next line of code I will do a fisher test to combine the p-values
            combined_p_value = 1 - pchisq(sum(qchisq(1 - p_value, 1, lower.tail = FALSE)), length(p_value), lower.tail = FALSE)) %>%
  #the above code will return a dataframe that only contains 3 of the columns. In the next line of code I will bring the rest of the columns back to this new dataframe
  left_join(all_DEG_data, by = "gene_name")


#Keep track of if the gene combined logfoldcahnge and p-value are from multiple data sources
meta_analysis_all_DEG_data$multiple_data_sources <- ifelse(duplicated(meta_analysis_all_DEG_data$gene_name) | duplicated(meta_analysis_all_DEG_data$gene_name, fromLast = TRUE), "Yes", "No")


#Now, I want to delete rows that have the same gene name and remove the columns that have the original log2_fold_change, p_value, sample_type, and data_source
meta_analysis_all_DEG_data <- meta_analysis_all_DEG_data %>%
  distinct(gene_name, .keep_all = TRUE) %>%
  select(-c(log2_fold_change, p_value, sample_type, data_source))

#This code adds a column called significance that will indicate the degree of significance, just like Smrithi's code
meta_analysis_all_DEG_data <- meta_analysis_all_DEG_data %>%
  mutate("significance" = case_when(
    combined_p_value < 0.001 ~ "***",
    combined_p_value < 0.01 ~ "**",
    combined_p_value < 0.05 ~ "*",
    TRUE ~ " "
  ))

#Now i will create a new dataframe with only the genes that are significantly differentially expressed at combined p-value < 0.001
sig_genes_p0.001_DEG_data <- meta_analysis_all_DEG_data %>%
  filter(significance == "***")

```

Exporting the dataframes to CSV files, if desired

```{r}
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")

write.csv(Urresti2021_3M_organoid_tidy, file="Urresti2021_3M_organoid_tidy.csv")
write.csv(Roth2020_tidy, file="Roth2020_tidy.csv")
write.csv(Talkowski2014_tidy, file="Talkowski2014_tidy.csv")
write.csv(all_DEG_data, file="all_DEG_data.csv")
write.csv(meta_analysis_all_DEG_data, file="meta_analysis_all_DEG_data.csv")
write.csv(sig_genes_p0.001_DEG_data, file="sig_genes_p0.001_DEG_data.csv")
```


Gene ontology analysis
I am using gprofiler2 to do my GO analysis
Loading gprofiler2 package
```{r}
library(gprofiler2)
```


Doing GO analysis
```{r}
# Making vector of genes in sig gene dataframe
exp_gene_list_1_p_0.001 <- sig_genes_p0.001_DEG_data$gene_name

# using gost function frmo gprofiler2 to do GO analysis
GO_results_exp_gene_list_p0.001 = gost(query = exp_gene_list_1_p_0.001,
               organism = "hsapiens",
               correction_method = "fdr")
GO_all_results_p0.001 <- GO_results_exp_gene_list_p0.001$result %>%
  select(-parents)

#putting metadata from GO analysis into a dataframe
GO_meta_p0.001 <- GO_results_exp_gene_list_p0.001$meta


#Creating gene lists for upregulated and downregulated genes
UPregulated_sig_genes_p0.001_DEG_data <- sig_genes_p0.001_DEG_data %>%
  filter(combined_log2_fold_change > 0)
UPregulated_exp_gene_list_1_p_0.001 <- UPregulated_sig_genes_p0.001_DEG_data$gene_name

DOWNregulated_sig_genes_p0.001_DEG_data <- sig_genes_p0.001_DEG_data %>%
  filter(combined_log2_fold_change < 0)
DOWNregulated_sig_genes_p0.001_DEG_data <- DOWNregulated_sig_genes_p0.001_DEG_data$gene_name

up_reg_genes <- sig_genes_p0.001_DEG_data %>% filter(combined_log2_fold_change > 0)
up_reg_gene_names <- up_reg_genes$gene_name

# GO analysis of upregulated and downregulated genes
gostres_UPregulated <- gost(query = UPregulated_exp_gene_list_1_p_0.001,
               organism = "hsapiens",
               correction_method = "fdr")

gostres_UPregulated <- gostres_UPregulated$result


gostres_DOWNregulated <- gost(query = DOWNregulated_sig_genes_p0.001_DEG_data,
                            organism = "hsapiens",
                            correction_method = "fdr")

gostres_DOWNregulated <- gostres_DOWNregulated$result
```

Exporting the GO analysis dataframes to CSV files, if desired
```{r}
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
write.csv(gostres_results, file="/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/output/GO_16p11.2-DEGs_p0.001_fdr-corrections.csv", row.names = FALSE)


```


NOW, we generate map of our significant genes to AHBA spatial map


First, read in the data
```{r}
AHBA <- read_tsv("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets/AllenHBA_DK_ExpressionMatrix.tsv")
# Now, I want to cchange the column header of the first column of the AHBA dataframe to gene_symbol
colnames(AHBA)[1] <- "gene_symbol"

```


Mapping our significant genes to the AHBA dataset
```{r}
#Creating vector of gene names in our experimental groups
sig_genes_p0.001_DEG_data #dataframe containing all significant genes
sig_gene_list <- sig_genes_p0.001_DEG_data$gene_name #vector of all significant genes names
sixteenP11.2_genes #dataframe containing all genes in 16p11.2 locus
sixteen_p_11.2_genes <- chr16p11.2_gene_list_Kusenda2015$Gene.symbol #vector of all genes in 16p locus

AHBA_sig_genes <- AHBA %>% filter(gene_symbol %in% sig_gene_list)

#NOTE: not all sig genes mapped to AHBA dataset. Here is a list of them so I can troubleshoot later
genes_not_mapped <- sig_gene_list[!(sig_gene_list %in% AHBA_sig_genes$gene_symbol)]
```
NOTE: Only 87 of the 132 genes mapped to the AHBA dataset


Tidying the AHBA subset dataframe with significant genes only
```{r}
AHBA_sig_genes_pivot <- AHBA_sig_genes %>%
  #remove the "Average donor correlation to the median" column
  select(-"Average donor correlation to median") %>%
  pivot_longer(cols = -gene_symbol, names_to = "region", values_to = "expression") %>%
  #adding a column for if gene is in the 16p region
  mutate(in_16p_region = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No")) %>%
  # Adding a column for if gene is up or downregulated
  mutate(upreg = ifelse(gene_symbol %in% up_reg_gene_names, "Yes", "No"))

```


GGplot to visualize the data
```{r}
# Visualizing expression data of significant genes
ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, shape=in_16p_region, color=gene_symbol)) + 
  geom_point() + 
  labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
  theme_minimal() +
  #I will make the legend smaller and put it underneath the graph  because it is too big
  theme(legend.position = "bottom") +
  guides(color="none")

# same plot but aesthetics only for genes in 16p region
ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, color=in_16p_region)) + 
  geom_point() + 
  labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
  theme_minimal() +
  #I will make the legend smaller and put it underneath the graph  because it is too big
  theme(legend.position = "bottom") 


ggplot(AHBA_sig_genes_pivot, aes(x=expression, y=region, color=upreg)) + 
  geom_point() + 
  labs(title = "regional expression of significant genes from 16p del meta-analysis", x = "Raw Expression", y = "Brain Region") +
  theme_minimal() +
  #I will make the legend smaller and put it underneath the graph  because it is too big
  theme(legend.position = "bottom") 
```


Now, we want to work with the normalized AHBA values. They are in a matlab file, so first I will need to convert the matlab file to something R can read, with the R.matlab package. 
NOTE: The R.matlab package does not keep column names of the original data when converting things, so I had to add them back in.

First, load libraries
```{r}
library(R.matlab)
```

Second, load in relevant files
```{r}
setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics")
AHBA_normed <- readMat("ROIxGene_aparcaseg_INT.mat")

setwd("/Users/tammyray/Desktop/Aaron_16p_Imaging_Transcriptomics/CSV_data_sheets")
region_array <- read_csv("Region_labels_for_aparcaseg_parcellation.csv", col_names = FALSE)
```

Tidying up the AHBA_normed dataframe
```{r}
AHBA_parcel_expression0 <- as.data.frame(AHBA_normed$parcelExpression)
# Getting list of gene names 
array_gene_names <- unlist(AHBA_normed$probeInformation[[2]])
# First i need to edit my array_gene_names to add a 0 as the first item in my array
array_gene_names <- c(0, array_gene_names)
# Now, I want to rename the columns of the AHBA_parcel_expression dataframe to the gene names in array_gene_names
colnames(AHBA_parcel_expression0) <- array_gene_names

#Now, I want to change the values of column 1 of the AHBA_parcel_expression0 dataframe to the values in the region_array dataframe
#Maybe I just mutate the dataframe to add the array as a column, move that column to the front, and then remove the original column
AHBA_parcel_expression0_labeled <- AHBA_parcel_expression0 %>%
  mutate(region = region_array) %>%
  #Now i want to move the region$x1 var to be the first column
  select(region, everything()) %>%
  select(-"0")

#Yay now data table is as i want it to be!
# Now I will pivot the table so that it matches the AHBA_sig_genes_all_pivot table
AHBA_normed_pivot <- AHBA_parcel_expression0_labeled %>%
  pivot_longer(cols = -region, names_to = "gene_symbol", values_to = "expression") %>%
  mutate(in_16p_region = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No")) %>%
  unnest(cols = region)



AHBA_normed_pivot_sigONLY <- AHBA_normed_pivot %>%
  filter(gene_symbol %in% sig_gene_list)


# Now, I want to change the column name X1 to region
colnames(AHBA_normed_pivot_sigONLY)[1] <- "region"

```

Exporting dataframes as CSVs
```{r}
# OK, Now that I have this data, I want to put it into a CSV so I have it for future reference
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
write.csv(AHBA_normed_pivot, "Normed_AHBA_sig_genes.csv")
write.csv(AHBA_normed_pivot_sigONLY, "Normed_AHBA_sig_genes_ONLY.csv")
write.csv(AHBA_sig_genes_all_pivot_ONLY, "Raw_AHBA_sig_genes_ONLY.csv")
```


Plotting the normalized AHBA data
```{r}
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, color=gene_symbol)) + 
  geom_point() + 
  labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
  theme_minimal() +
  #I will remove the legend because it is too big
  theme(legend.position = "none")


# Visualizing expression data of significant genes
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, shape=in_16p_region, color=gene_symbol)) + 
  geom_point() + 
  labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
  theme_minimal() +
  #I will make the legend smaller and put it underneath the graph  because it is too big
  theme(legend.position = "bottom") +
  guides(color="none")

# same plot but aesthetics only for genes in 16p region
ggplot(AHBA_normed_pivot_sigONLY, aes(x=expression, y=region, color=in_16p_region)) + 
  geom_point() + 
  labs(title = "NORMALIZED regional expression of significant genes from 16p del meta-analysis", x = "Normalized Expression", y = "Brain Region") +
  theme_minimal() +
  #I will make the legend smaller and put it underneath the graph  because it is too big
  theme(legend.position = "bottom") 
```

Now I will map the normalized AHBA data to Smrithi's MRI data, which is in a csv file: 16panalysis_BH_adjusted.csv

Importing necessary files
```{r}
# Importing Smrithi's MRI dataset
setwd("/Users/tammyray/Desktop/aaron_lab_2024/output")
MRI <- read_csv("16panalysis_BH_adjusted.csv")

# Outside of R, I have made a CSV of corresponding AHBA and Smrithi MRI data region names. First column is AHBA names, second column is MRI names.
# Importing the data
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
standard_region_names <- read_csv("AHBA_MRI_region_names.csv")

# Remembering other files I will use in the following chunks
sixteen_p_11.2_genes #vector of all genes in 16p locus
```


Tidying Smrithi's MRI data
```{r}
# We decided that we only want to use model 3, Spline TCV model (decided w Smrithi meeting 8/24/24)
# We are only using the deletion data for now
MRI_model3 <- MRI %>%
  filter(Genotype == "Genotype16pDeletion...5")

#MRI regions to remove. I made this list manually using google sheets and reformatted it for R using chatgpt
MRI_regions_to_remove <- c("L.Cerebral_WM", "L.cerebral_cortex", "L.lateral_ventricle", "L.inferior_lateral_ventricle", "L.cerebellum_WM", "L.cerebellum_cortex", 
                           "L.thalamus", "L.caudate", "L.putamen", "L.pallidum", "L.accumbens", "L.hippocampus", "L.amygdala", "L.ventral_DC", "Third_Ventricle", 
                           "Fourth_Ventricle", "Brainstem", "CSF", "R.Cerebral_WM", "R.cerebral_cortex", "R.lateral_ventricle", "R.inferior_lateral_ventricle", 
                           "R.cerebellum_WM", "R.cerebellum_cortex", "R.thalamus", "R.caudate", "R.putamen", "R.pallidum", "R.hippocampus", "R.amygdala", "R.accumbens", 
                           "R.ventral_DC", "R.ctx_bankssts", "R.ctx_caudalanteriorcingulate", "R.ctx_caudalmiddlefrontal", "R.ctx_cuneus", "R.ctx_entorhinal", "R.ctx_fusiform", 
                           "R.ctx_inferiorparietal", "R.ctx_inferiortemporal", "R.ctx_isthmuscingulate", "R.ctx_lateraloccipital", "R.ctx_lateralorbitofrontal", "R.ctx_lingual", 
                           "R.ctx_medialorbitofrontal", "R.ctx_middletemporal", "R.ctx_parahippocampal", "R.ctx_paracentral", "R.ctx_parsopercularis", "R.ctx_parsorbitalis", 
                           "R.ctx_parstriangularis", "R.ctx_pericalcarine", "R.ctx_postcentral", "R.ctx_posteriorcingulate", "R.ctx_precentral", "R.ctx_precuneus", 
                           "R.ctx_rostralanteriorcingulate", "R.ctx_rostralmiddlefrontal", "R.ctx_superiorfrontal", "R.ctx_superiorparietal", "R.ctx_superiortemporal", 
                           "R.ctx_supramarginal", "R.ctx_frontalpole", "R.ctx_temporalpole", "R.ctx_transversetemporal", "R.ctx_insula", "cGMV", "WMV", "sGMV", "Ventricles", 
                           "Cerebellum", "Accumbens", "Ventral_Diencephalon", "Pallidum", "Hippocampus", "Caudate", "Cerebral_White_Matter", "Lateral_Ventricle", "Cerebral_Cortex", 
                           "Thalamus", "Putamen", "Amygdala", "Cerebellar_White_Matter", "Cerebellar_Cortex", "Inferior_Lateral_Ventricle", "ctx_bankssts", "ctx_caudalanteriorcingulate", 
                           "ctx_caudalmiddlefrontal", "ctx_cuneus", "ctx_entorhinal", "ctx_fusiform", "ctx_inferiorparietal", "ctx_inferiortemporal", "ctx_isthmuscingulate", 
                           "ctx_lateraloccipital", "ctx_lateralorbitofrontal", "ctx_lingual", "ctx_medialorbitofrontal", "ctx_middletemporal", "ctx_parahippocampal", 
                           "ctx_paracentral", "ctx_parsopercularis", "ctx_parsorbitalis", "ctx_parstriangularis", "ctx_pericalcarine", "ctx_postcentral", "ctx_posteriorcingulate", 
                           "ctx_precentral", "ctx_precuneus", "ctx_rostralanteriorcingulate", "ctx_rostralmiddlefrontal", "ctx_superiorfrontal", "ctx_superiorparietal", 
                           "ctx_superiortemporal", "ctx_supramarginal", "ctx_frontalpole", "ctx_temporalpole", "ctx_transversetemporal", "ctx_insula")

# removing rows for the regions we aren't including
MRI_model3_leftHem <- MRI_model3 %>%
  filter(VolumetricComponent %in% MRI_regions_to_remove == FALSE) %>%
  rename(region = VolumetricComponent)

# Now, I want to use the standard region
standard_region_names <- standard_region_names %>%
  rename(region = "left hemisphere Smrithi regions",
         new_region = "AHBA regions")

# Use left_join to map the new names based on the region variable
MRI_model3_leftHem_new_regionNames <- MRI_model3_leftHem %>%
  left_join(standard_region_names, by = "region") %>%   # Join based on the region variable
  mutate(region = ifelse(!is.na(new_region), new_region, region)) %>%   # Replace region names with new ones
  select(-new_region)  # Remove the extra column if needed

```


Merging the AHBA normed values with Smrithi's MRI data
```{r}
AHBA_MRI_merged <- full_join(AHBA_normed_pivot_sigONLY, MRI_model3_leftHem_new_regionNames, by = "region", multiple = "all")

AHBA_MRI_merged_organized <- AHBA_MRI_merged %>%
  select(-Genotype)

colnames(AHBA_MRI_merged_organized)[3] <- "AHBA_expression"
colnames(AHBA_MRI_merged_organized)[8] <- "t_value"

```


Exporting as csv AHBA_MRI_merged_organized
```{r}
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
write_csv(AHBA_MRI_merged_organized, "AHBA_MRI_merged.csv")
```

Pearson correlation analysis
I want to do a correlation analysis between the AHBA expression and the MRI data

Creating functions to compute correlation and p-value, created by ChatGPT
```{r}
correlation_with_pvalue_pearson <- function(x, y) {
  test <- cor.test(x, y, method = "pearson")
  return(data.frame(correlation = test$estimate, p_value = test$p.value))
}

correlation_with_pvalue_spearman <- function(x, y) {
  test <- cor.test(x, y, method = "spearman")
  return(data.frame(correlation = test$estimate, p_value = test$p.value))
}
```

Running the correlation analysis
```{r}
# Group by gene_symbol and compute correlation between AHBA_expression and Estimate
cor_results_Estimate_pearson <- AHBA_MRI_merged_organized %>%
  group_by(gene_symbol) %>%
  summarize(correlation = cor(AHBA_expression, Estimate, method = "pearson"),
            p_value = correlation_with_pvalue_pearson(AHBA_expression, Estimate)$p_value) %>%
  mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))

#Trying a spearman correlation analysis instead
cor_results_Estimate_spearman <- AHBA_MRI_merged_organized %>%
  group_by(gene_symbol) %>%
  summarize(correlation = cor(AHBA_expression, Estimate, method = "spearman"),
            p_value = cor.test(AHBA_expression, Estimate, method = "spearman")$p.value) %>%
  mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))

# Doing the same thing for the t value
# Pearson correlation
cor_results_t_value_pearson <- AHBA_MRI_merged_organized %>%
  group_by(gene_symbol) %>%
  summarize(correlation = cor(AHBA_expression, t_value, method = "pearson"),
            p_value = correlation_with_pvalue_pearson(AHBA_expression, t_value)$p_value) %>%
  mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))

#Spearman correlation
cor_results_t_value_spearman <- AHBA_MRI_merged_organized %>%
  group_by(gene_symbol) %>%
  summarize(correlation = cor(AHBA_expression, t_value, method = "spearman"),
            p_value = cor.test(AHBA_expression, t_value, method = "spearman")$p.value) %>%
  mutate(in_16p_locus = ifelse(gene_symbol %in% sixteen_p_11.2_genes, "Yes", "No"))
```


Exporting the correlation analysis dataframes to CSV files
```{r}
setwd("/Users/tammyray/Desktop/aaron_lab_2024/CSV_data_sheets")
write_csv(cor_results_Estimate_pearson, "AHBA_MRI_correlation_Estimate_Pearson.csv")
write_csv(cor_results_Estimate_spearman, "AHBA_MRI_correlation_Estimate_Spearman.csv")
write_csv(cor_results_t_value_pearson, "AHBA_MRI_correlation_t_value_Pearson.csv")
write_csv(cor_results_t_value_spearman, "AHBA_MRI_correlation_t_value_Spearman.csv")
```


Now, I want to visualize the correlation data
```{r}
#Scatterplot showing relationship between p value and correlation, not super useful but educational for me!
ggplot(cor_results_t_value_spearman, aes(x=p_value, y=correlation, color=in_16p_locus)) +
  geom_point() +
  labs(title = "Spearman correlation of t-value",
       x = "p value",
       y = "correlation") +
  theme_minimal() + 
  geom_vline(aes(xintercept = 0.05))

#making a density curve instead?
ggplot(cor_results_t_value_spearman, aes(x=correlation, color=in_16p_locus)) +
  geom_density() +
  labs(title = "Spearman correlation of t-value",
       x = "Correlation Coefficient",
       y = "Density") +
  theme_minimal()

# making a histogram instead
ggplot(cor_results_t_value_spearman, aes(x= abs(correlation), fill=in_16p_locus)) +
  geom_histogram(position = "identity", alpha = 0.6) +
  labs(title = "Spearman correlation of t-value",
       x = "Absolute Value of Correlation Coefficient",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 0.6) + ylim(0, 8)

ggplot(cor_results_Estimate_spearman, aes(x=abs(correlation), fill=in_16p_locus)) +
  geom_histogram(position = "identity", alpha = 0.6) +
  labs(title = "Spearman correlation of Estimate(~effect size)",
       x = "Absolute Value of Correlation Coefficient",
       y = "Frequency") +
  theme_minimal() +
  xlim(0, 0.6) + ylim(0, 8)
```

Now I want to plot the data
```{r}
library("ggpubr")
library(plotly)
```

```{r}
#Making a huge plot with a lot of little plots inside (courtesy of facet_wrap) of all genes
scatter_plot_estimate <- ggplot(AHBA_MRI_merged_organized, aes(x = AHBA_expression, y = Estimate, color = region)) +
  geom_point(size = 0.1) + 
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5) +  # Add a line of best fit (linear regression)
  labs(title = "Scatter plots of AHBA Expression vs Estimate for all Genes",
       x = "AHBA Expression",
       y = "Estimate") +
  facet_wrap(~ gene_symbol, scales = "free") +  # Separate plots for each gene
  theme_minimal() +
  xlim(0, 1) + ylim(-0.7, 0.7) #+
  stat_cor(aes(label = paste(..r.label.., sep = "~`,`~")), 
         method = "pearson", label.x = 3, label.y = 0)  # Add correlation coefficients to the plot

# Convert to a plotly object
interactive_plot_estimate <- ggplotly(scatter_plot_estimate, tooltip = c("text"))
interactive_plot_estimate # Display the interactive plot

#Doing the same plot, but for t-value
scatter_plot_t_value <- ggplot(AHBA_MRI_merged_organized, aes(x = AHBA_expression, y = Estimate, color = region)) +
  geom_point(size = 0.1) + 
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5) +  # Add a line of best fit (linear regression)
  labs(title = "Scatter plots of AHBA Expression vs Estimate for all Genes",
       x = "AHBA Expression",
       y = "Estimate") +
  facet_wrap(~ gene_symbol, scales = "free") +  # Separate plots for each gene
  theme_minimal() +
  stat_cor(aes(label = paste(..r.label.., sep = "~`,`~")), 
           method = "pearson", label.x = 3, label.y = 0)  # Add correlation coefficients to the plot

# Convert to a plotly object
interactive_plot_t_value <- ggplotly(scatter_plot_t_value, tooltip = c("text"))
interactive_plot_t_value # Display the interactive plot
```


Doing the above plotting for the 16p genes only
```{r}
# Doing the above plotting for the NON 16p genes ----
AHBA_MRI_merged_organized_NOT_IN_16p_locus <- AHBA_MRI_merged_organized %>% filter(in_16p_region == "No")

scatter_plot_NOT_16p_genes <- ggplot(AHBA_MRI_merged_organized_NOT_IN_16p_locus, aes(x = AHBA_expression, y = Estimate, color = region)) +
  geom_point(size = 0.1) + 
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5, fullrange = TRUE) +  # Add a line of best fit (linear regression)
  labs(title = "Scatter plots of AHBA Expression vs Estimate for Genes NOT in 16p locus",
       x = "AHBA Expression",
       y = "Estimate") +
  facet_wrap(~ gene_symbol, scales = "free") +  # Separate plots for each gene
  theme_minimal() +
  stat_cor(aes(label = paste(..r.label.., sep = "~`,`~")), 
           method = "pearson", label.x = 3, label.y = 0) + # Add correlation coefficients to the plot
  xlim(0, 1)  # Adjust axis limits
# Convert to a plotly object
interactive_plot_NOT_16p_genes <- ggplotly(scatter_plot_NOT_16p_genes, tooltip = c("text"))

# Display the interactive plot
interactive_plot_NOT_16p_genes
```


Adding correlation coefficiencies and p values to plots
1. Making new dataframw with facet labels
```{r}
# Merge main data with correlation results
plot_data_t_value_spearman <- AHBA_MRI_merged_organized %>%
  left_join(cor_results_t_value_spearman, by = "gene_symbol") %>% # Merge by gene_symbol
  mutate(facet_label = paste0(gene_symbol, "\n", 
                              "| R: ", round(correlation, 2), 
                              ", p: ", round(p_value, 3)))
```


2. replotting data
```{r}
ggplot(plot_data_t_value_spearman, aes(x = AHBA_expression, y = t_value, color = region)) +
  geom_point(size = 0.1) + 
  geom_smooth(method = "lm", se = FALSE, color = "blue", linewidth = 0.5, fullrange = TRUE) +  # Add a line of best fit (linear regression)
  labs(title = "Scatter plots of AHBA Expression vs t-value for all Genes",
       x = "AHBA Expression",
       y = "t-value") +
  facet_wrap(~ facet_label, scales = "free") +  # Use the custom facet label
  theme_minimal() +
  xlim(0, 1) #+ ylim(-0.7, 0.7)  # Adjust axis limits
```







